Consumer segmentation survey
# Read the consumer segmentation survey and preview its first five rows.
segmentation <- read_csv(file = "http://goo.gl/qw303p")
head(segmentation, n = 5)
# A tibble: 5 × 7
age gender income kids ownHome subscribe Segment
<dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
1 47.3 Male 49483. 2 ownNo subNo Suburb mix
2 31.4 Male 35546. 1 ownYes subNo Suburb mix
3 43.2 Male 44169. 0 ownYes subNo Suburb mix
4 37.3 Female 81042. 1 ownNo subNo Suburb mix
5 41.0 Female 79353. 3 ownYes subNo Suburb mix
# Number of respondents in each segment.
count(segmentation, Segment)
# A tibble: 4 × 2
Segment n
<chr> <int>
1 Moving up 70
2 Suburb mix 100
3 Travelers 80
4 Urban hip 50
# Cross-tabulate subscription status (rows) against home ownership (columns).
pivot_wider(
  count(segmentation, subscribe, ownHome),
  id_cols = subscribe,
  names_from = ownHome,
  values_from = n
)
# A tibble: 2 × 3
subscribe ownNo ownYes
<chr> <int> <int>
1 subNo 137 123
2 subYes 22 18
Chi-squared test for given probabilities
\(H_0: p_1 = \frac{1}{4} \land p_2 = \frac{1}{4} \land p_3 = \frac{1}{4} \land p_4 = \frac{1}{4}\)
\(H_1: p_1 \neq \frac{1}{4} \lor p_2 \neq \frac{1}{4} \lor p_3 \neq \frac{1}{4} \lor p_4 \neq \frac{1}{4}\)
\(\begin{align}
\chi^2 & = \sum_{i=1}^n \frac{(Observed_i - Expected_i)^2}{Expected_i} \\
& = \frac{(70 - 300\frac{1}{4})^2}{300\frac{1}{4}} + \frac{(100 - 300\frac{1}{4})^2}{300\frac{1}{4}} +
\frac{(80 - 300\frac{1}{4})^2}{300\frac{1}{4}} +
\frac{(50 - 300\frac{1}{4})^2}{300\frac{1}{4}}
\end{align}\)
# Goodness-of-fit test: are the four segments equally likely (p = 1/4 each)?
# The table() call is passed directly so the printed `data:` line is unchanged.
chi_statistic <- chisq.test(
  table(segmentation$Segment),
  p = rep(1 / 4, times = 4)
)
chi_statistic
Chi-squared test for given probabilities
data: table(segmentation$Segment)
X-squared = 17.333, df = 3, p-value = 0.0006035
Chi-squared test for given probabilities
Chi-squared test for given probabilities
\(H_0: p_1 = \frac{1}{4} \land p_2 = \frac{1}{4} \land p_3 = \frac{1}{4} \land p_4 = \frac{1}{4}\)
\(H_1: p_1 \neq \frac{1}{4} \lor p_2 \neq \frac{1}{4} \lor p_3 \neq \frac{1}{4} \lor p_4 \neq \frac{1}{4}\)
\(\begin{align}
\chi^2 & = \sum_{i=1}^n \frac{(Observed_i - Expected_i)^2}{Expected_i} \\
& = \frac{(70 - 300\frac{1}{4})^2}{300\frac{1}{4}} + \frac{(100 - 300\frac{1}{4})^2}{300\frac{1}{4}} + \frac{(80 - 300\frac{1}{4})^2}{300\frac{1}{4}} + \frac{(50 - 300\frac{1}{4})^2}{300\frac{1}{4}}
\end{align}\)
library (tidymodels)
# Same goodness-of-fit test through the tidy chisq_test() interface.
chisq_test(
  segmentation,
  response = Segment,
  p = rep(1 / 4, times = 4)
)
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <dbl> <dbl>
1 17.3 3 0.000603
Pearson’s Chi-squared test
\(H_0: p_{11} = \frac{260}{300}\frac{159}{300} \land p_{12} = \frac{260}{300}\frac{141}{300} \land p_{21} = \frac{40}{300}\frac{159}{300} \land p_{22} = \frac{40}{300}\frac{141}{300}\)
\(H_1: p_{11} \neq \frac{260}{300}\frac{159}{300} \lor p_{12} \neq \frac{260}{300}\frac{141}{300} \lor p_{21} \neq \frac{40}{300}\frac{159}{300} \lor p_{22} \neq \frac{40}{300}\frac{141}{300}\)
\(\begin{align}
\chi^2 & = \sum_{i=1}^n \frac{(Observed_i - Expected_i)^2}{Expected_i} \\
& = \frac{(137 - 300\frac{260}{300}\frac{159}{300})^2}{300\frac{260}{300}\frac{159}{300}} + \frac{(123 - 300\frac{260}{300}\frac{141}{300})^2}{300\frac{260}{300}\frac{141}{300}} \\
& \: \: \: + \frac{(22 - 300\frac{40}{300}\frac{159}{300})^2}{300\frac{40}{300}\frac{159}{300}} + \frac{(18 - 300\frac{40}{300}\frac{141}{300})^2}{300\frac{40}{300}\frac{141}{300}}
\end{align}\)
# Pearson test of independence between subscription and home ownership.
# `correct = FALSE` turns off the continuity correction for the 2 x 2 table.
chi_statistic <-
  table(segmentation$subscribe, segmentation$ownHome) |>
  chisq.test(correct = FALSE)
chi_statistic
Pearson's Chi-squared test
data: table(segmentation$subscribe, segmentation$ownHome)
X-squared = 0.074113, df = 1, p-value = 0.7854
Pearson’s Chi-squared test
Pearson’s Chi-squared test
\(H_0: p_{11} = \frac{260}{300}\frac{159}{300} \land p_{12} = \frac{260}{300}\frac{141}{300} \land p_{21} = \frac{40}{300}\frac{159}{300} \land p_{22} = \frac{40}{300}\frac{141}{300}\)
\(H_1: p_{11} \neq \frac{260}{300}\frac{159}{300} \lor p_{12} \neq \frac{260}{300}\frac{141}{300} \lor p_{21} \neq \frac{40}{300}\frac{159}{300} \lor p_{22} \neq \frac{40}{300}\frac{141}{300}\)
\(\begin{align}
\chi^2 & = \sum_{i=1}^n \frac{(Observed_i - Expected_i)^2}{Expected_i} \\
& = \frac{(137 - 300\frac{260}{300}\frac{159}{300})^2}{300\frac{260}{300}\frac{159}{300}} + \frac{(123 - 300\frac{260}{300}\frac{141}{300})^2}{300\frac{260}{300}\frac{141}{300}} \\
& \: \: \: + \frac{(22 - 300\frac{40}{300}\frac{159}{300})^2}{300\frac{40}{300}\frac{159}{300}} + \frac{(18 - 300\frac{40}{300}\frac{141}{300})^2}{300\frac{40}{300}\frac{141}{300}}
\end{align}\)
# Tidy version of the independence test (no continuity correction).
chisq_test(
  segmentation,
  formula = subscribe ~ ownHome,
  correct = FALSE
)
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <int> <dbl>
1 0.0741 1 0.785
2 sample t-test: independent samples
# Income distribution, one panel per home-ownership group.
# `bins = 30` spells out ggplot2's default bin count: the plot is unchanged,
# but the binning choice is explicit and the default-bins message is not
# emitted.
segmentation |>
  ggplot() +
  geom_histogram(aes(x = income), bins = 30, color = "black") +
  facet_wrap(facets = vars(ownHome))
2 sample t-test: independent samples
# Sample mean, sample variance and group size of income per ownership group;
# these are the inputs to the two-sample t statistic.
summarise(
  group_by(segmentation, ownHome),
  mean_income = mean(income),
  var_income = var(income),
  n = n()
)
# A tibble: 2 × 4
ownHome mean_income var_income n
<chr> <dbl> <dbl> <int>
1 ownNo 47391. 358692875. 159
2 ownYes 54935. 430890091. 141
2 sample t-test: independent samples
\(H_0: \mu_{ownNo} - \mu_{ownYes}= 0\) \(H_1: \mu_{ownNo} - \mu_{ownYes} \neq 0\)
\(t = \frac{\overline{ownNo} - \overline{ownYes}}{\sqrt{\frac{s_{ownNo}^2}{n_{ownNo}} + \frac{s_{ownYes}^2}{n_{ownYes}}}} = \frac{47391.01 - 54934.68}{\sqrt{ \frac{358692875}{159} + \frac{430890091}{141}}} \approx -3.273094\)
# Two-sided Welch t-test of mean income between non-owners and owners,
# testing a true difference of 0 and reporting a 95% confidence interval.
t_test <- t.test(
  income ~ ownHome,
  data = segmentation,
  mu = 0,
  alternative = "two.sided",
  conf.level = 0.95
)
t_test
Welch Two Sample t-test
data: income by ownHome
t = -3.2731, df = 285.25, p-value = 0.001195
alternative hypothesis: true difference in means between group ownNo and group ownYes is not equal to 0
95 percent confidence interval:
-12080.155 -3007.193
sample estimates:
mean in group ownNo mean in group ownYes
47391.01 54934.68
2 sample t-test: independent samples
\[c_L < \mu_{ownNo} - \mu_{ownYes} < c_U\]
\(\mu_{ownNo} - \mu_{ownYes}\) is not a random variable so we need to use a random variable
\[P \Biggr( t_L < \frac{\overline{x}_{ownNo} - \overline{x}_{ownYes} - (\mu_{ownNo} - \mu_{ownYes})}{\sqrt{\frac{s^2_{ownNo}}{n_{ownNo} } +\frac{s^2_{ownYes}}{n_{ownYes}}}} < t_U \Biggr) = 0.95\]
\(\overline{x}_{ownNo} - \overline{x}_{ownYes}\) is a random variable
# Lower and upper critical values of the t distribution for a 95% interval.
# The degrees of freedom are read from the fitted Welch test (`t_test`)
# instead of a rounded value copied by hand from its printed output.
t_L <- qt(p = 0.025, df = t_test$parameter[["df"]], lower.tail = TRUE)
t_L
t_U <- qt(p = 0.975, df = t_test$parameter[["df"]], lower.tail = TRUE)
t_U
\[P(-7543.674 - 1.968315\times2304.753 \\ < \mu_{ownNo} - \mu_{ownYes} < -7543.674 + 1.968315\times2304.753) = 0.95\] \[P(-12080.16 < \mu_{ownNo} - \mu_{ownYes} < -3007.193) = 0.95\]
In the long run 95% of confidence intervals constructed in this manner will contain the true parameter
2 sample t-test: independent samples
\(H_0: \mu_{ownNo} - \mu_{ownYes}= 0\)
\(H_1: \mu_{ownNo} - \mu_{ownYes} \neq 0\)
\(t = \frac{\overline{ownNo} - \overline{ownYes}}{\sqrt{\frac{s_{ownNo}^2}{n_{ownNo}} + \frac{s_{ownYes}^2}{n_{ownYes}}}} = \frac{47391.01 - 54934.68}{\sqrt{ \frac{358692875}{159} + \frac{430890091}{141}}} \approx -3.273094\)
# Tidy version of the two-sample t-test; `order` fixes the direction of the
# difference as ownNo minus ownYes.
t_test(
  segmentation,
  formula = income ~ ownHome,
  order = c("ownNo", "ownYes"),
  alternative = "two-sided",
  mu = 0,
  conf_level = 0.95
)
# A tibble: 1 × 7
statistic t_df p_value alternative estimate lower_ci upper_ci
<dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 -3.27 285. 0.00119 two.sided -7544. -12080. -3007.
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Mean, variance and size of income within each segment.
summarise(
  group_by(segmentation, Segment),
  mean = mean(income),
  variance = var(income),
  n = n()
)
# A tibble: 4 × 4
Segment mean variance n
<chr> <dbl> <dbl> <int>
1 Moving up 53091. 92862689. 70
2 Suburb mix 55034. 142761527. 100
3 Travelers 62214. 564173979. 80
4 Urban hip 21682. 23885953. 50
Testing Multiple Group Means: Analysis of Variance (ANOVA)
Testing Multiple Group Means: Analysis of Variance (ANOVA)
\(H_0: \mu_{Moving\;up} = \mu_{Suburb\;mix} = \mu_{Travelers} = \mu_{Urban\;hip}\)
\(H_1: \text{At least one group mean is different from the rest}\)
\(n = \sum_{j=1}^4 n_j = n_1 + \cdots + n_4 = 70 + 100 + 80 + 50 = 300\)
\(\overline{income} = \frac{1}{n} \sum_{j=1}^4 \sum_{i=1}^{n_j} income_{ij}\)
\(\overline{income}_j = \frac{1}{n_j} \sum_{i=1}^{n_j} income_{ij}\)
\(F = \frac{\frac{\sum_{j=1}^4 \sum_{i=1}^{n_j} (\overline{income}_j - \overline{income})^2}{4-1}}{\frac{\sum_{j=1}^4 \sum_{i=1}^{n_j} (income_{ij} - \overline{income}_j)^2}{300 - 4}} = \frac{\frac{54969675428}{3}}{\frac{66281072794}{296}} = \frac{18323225143}{223922543} = 81.82841\)
# One-way ANOVA of income on Segment, shown as the classic ANOVA table.
anova_table <- anova(
  aov(formula = income ~ Segment, data = segmentation)
)
anova_table
Analysis of Variance Table
Response: income
Df Sum Sq Mean Sq F value Pr(>F)
Segment 3 5.4970e+10 1.8323e+10 81.828 < 2.2e-16 ***
Residuals 296 6.6281e+10 2.2392e+08
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Same one-way ANOVA, converted to a tibble with tidy().
anova_table <- tidy(
  anova(
    aov(formula = income ~ Segment, data = segmentation)
  )
)
anova_table
# A tibble: 2 × 6
term df sumsq meansq statistic p.value
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 Segment 3 54969675428. 18323225143. 81.8 1.41e-38
2 Residuals 296 66281072794. 223922543. NA NA
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Index i for the Segment factor: distinct levels, sorted, numbered 1..4.
segment_levels <- arrange(distinct(segmentation, Segment), Segment)
rowid_to_column(segment_levels, var = "i")
# A tibble: 4 × 2
i Segment
<int> <chr>
1 1 Moving up
2 2 Suburb mix
3 3 Travelers
4 4 Urban hip
# Index j for the ownHome factor: distinct levels numbered in order of
# first appearance.
rowid_to_column(distinct(segmentation, ownHome), var = "j")
# A tibble: 2 × 2
j ownHome
<int> <chr>
1 1 ownNo
2 2 ownYes
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Number of observations n_ij in every Segment x ownHome cell.
count(segmentation, Segment, ownHome, name = "n_ij")
# A tibble: 8 × 3
Segment ownHome n_ij
<chr> <chr> <int>
1 Moving up ownNo 47
2 Moving up ownYes 23
3 Suburb mix ownNo 52
4 Suburb mix ownYes 48
5 Travelers ownNo 20
6 Travelers ownYes 60
7 Urban hip ownNo 40
8 Urban hip ownYes 10
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Mean income for every Segment x ownHome cell. `.groups = "drop"` returns an
# ungrouped tibble directly, so no separate ungroup() is needed and
# summarise() does not emit its regrouping message.
mu_ij <- segmentation |>
  group_by(Segment, ownHome) |>
  summarise(mean = mean(income), .groups = "drop")
# Mean of the reference cell (i = 1, j = 1). The cell is picked by its labels
# rather than by row position, so it does not depend on the row order.
mu_11 <- mu_ij |>
  filter(Segment == "Moving up", ownHome == "ownNo") |>
  pull(mean)
mu_11
# First five rows of the response and the two factors used in the model.
head(select(segmentation, income, Segment, ownHome), n = 5)
# A tibble: 5 × 3
income Segment ownHome
<dbl> <chr> <chr>
1 49483. Suburb mix ownNo
2 35546. Suburb mix ownYes
3 44169. Suburb mix ownYes
4 81042. Suburb mix ownNo
5 79353. Suburb mix ownYes
Testing Multiple Group Means: Analysis of Variance (ANOVA)
Testing Multiple Group Means: Analysis of Variance (ANOVA)
\[\begin{split}
income_{ijk} = & \mu + \alpha_i + \beta_j + (\alpha\beta)_{ij} + \epsilon_{ijk} \\
& \text{ where } \epsilon_{ijk} \sim \mathcal{N}(0, \sigma^2) \\
& \text{ and } i = 1, 2, 3, 4 \\
& j = 1, 2 \\
& k = 1, \ldots n_{ij} \\
& \mu = \mu_{11} \\
& \alpha_1 = \beta_1 = 0 \\
& (\alpha\beta)_{11} = (\alpha\beta)_{12} = 0 \\
& (\alpha\beta)_{21} = (\alpha\beta)_{31} = (\alpha\beta)_{41} = 0 \\
\end{split}\]
Testing Multiple Group Means: Analysis of Variance (ANOVA)
\[\begin{split}
\widehat{income}_{ijk} = & \widehat{\mu} + \widehat{\alpha}_i + \widehat{\beta}_j + (\widehat{\alpha\beta})_{ij} \\
& \text{ and } i = 1, 2, 3, 4 \\
& j = 1, 2 \\
& k = 1, \ldots n_{ij} \\
& \widehat{\mu} = \widehat{\mu}_{11} \\
& \widehat{\alpha}_2, \widehat{\alpha}_3, \widehat{\alpha}_4 \\
& \widehat{\beta}_2 \\
& (\widehat{\alpha\beta})_{22}, (\widehat{\alpha\beta})_{32}, (\widehat{\alpha\beta})_{42}
\end{split}\]
\[income_{ijk} - \widehat{income}_{ijk} = \widehat{\epsilon}_{ijk}\]
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Raw values of the first two observations, for comparison with their
# design-matrix rows.
first_two <- head(select(segmentation, income, Segment, ownHome), n = 2)
glimpse(first_two)
Rows: 2
Columns: 3
$ income <dbl> 49482.81, 35546.29
$ Segment <chr> "Suburb mix", "Suburb mix"
$ ownHome <chr> "ownNo", "ownYes"
# Build the model frame for the two-way model with interaction, then show the
# design-matrix rows (dummy coding) of the first two observations.
framed <- model_frame(
  formula = income ~ Segment + ownHome + Segment:ownHome,
  data = segmentation
)
design_matrix <- model_matrix(terms = framed$terms, data = framed$data)
glimpse(head(design_matrix, n = 2))
Rows: 2
Columns: 8
$ `(Intercept)` <dbl> 1, 1
$ `SegmentSuburb mix` <dbl> 1, 1
$ SegmentTravelers <dbl> 0, 0
$ `SegmentUrban hip` <dbl> 0, 0
$ ownHomeownYes <dbl> 0, 1
$ `SegmentSuburb mix:ownHomeownYes` <dbl> 0, 1
$ `SegmentTravelers:ownHomeownYes` <dbl> 0, 0
$ `SegmentUrban hip:ownHomeownYes` <dbl> 0, 0
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Two-way ANOVA with interaction; list the eight estimated coefficients
# as a two-column tibble (coef, value).
model_aov <- aov(
  formula = income ~ Segment + ownHome + Segment:ownHome,
  data = segmentation
)
enframe(coef(model_aov), name = "coef")
# A tibble: 8 × 2
coef value
<chr> <dbl>
1 (Intercept) 54498.
2 SegmentSuburb mix 435.
3 SegmentTravelers 8691.
4 SegmentUrban hip -33160.
5 ownHomeownYes -4281.
6 SegmentSuburb mix:ownHomeownYes 4492.
7 SegmentTravelers:ownHomeownYes 2982.
8 SegmentUrban hip:ownHomeownYes 6003.
Testing Multiple Group Means: Analysis of Variance (ANOVA)
Analysis of Variance Table
Response: income
Df Sum Sq Mean Sq F value Pr(>F)
Segment 3 5.4970e+10 1.8323e+10 81.1305 <2e-16 ***
ownHome 1 6.9918e+07 6.9918e+07 0.3096 0.5784
Segment:ownHome 3 2.6329e+08 8.7762e+07 0.3886 0.7613
Residuals 292 6.5948e+10 2.2585e+08
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Testing Multiple Group Means: Analysis of Variance (ANOVA)
# Linear model without an intercept (`- 1`), so each Segment gets its own
# coefficient; tidy() returns the estimates with confidence intervals.
# NOTE(review): this reuses the name `model_aov`, replacing the earlier aov
# fit with a coefficient tibble.
model_aov <- tidy(
  lm(formula = income ~ -1 + Segment, data = segmentation),
  conf.int = TRUE
)